1 Preparations

1.1 Status

We are close to something acceptable

1.2 Notation

  • \(D\) Treatment indicator (binary or multiarm)
  • \(y\) outcome
  • \(X\) features/controls

1.3 Seed

rm(list=ls())
seed<-1909

1.4 Libraries

# loading & modifying data
library("readr")         # to read the data
library("dplyr")         # to manipulate data
library("fastDummies")   # create dummies
# charts & tables
library("ggplot2")       # to create charts
library("patchwork")     # to combine charts
library("flextable")     # design tables
library("modelsummary")  # structure tables
library("kableExtra")    # design table
library("estimatr")
library("ggpubr")
# regression & analysis
library("fixest")        # high dimensional FE
library("skimr")         # skim the data
# machine learning
library("policytree")    # policy tree (Athey & Wager, 2021)
library("grf")           # causal forest
library("rsample")       # data splitting 
library("randomForest")  # Traditional Random Forests
library("mlr3")          # learners
library("mlr3learners")  # learners
library("gbm")           # Generalized Boosted Regression
library("DoubleML")      # Double ML

1.5 Load and prepare data

1.5.1 Load data

# load full dataset

df_repl<-read_delim("../data/FARS-data-full-sample.txt",delim = "\t")%>%
              filter(year<2004)%>%
              select(-starts_with("imp"))
# load small dataset
df_sel<-read_delim("../data/FARS-data-selection-sample.txt",delim = "\t")%>%
              filter(year<2004)%>%
              select(-starts_with("imp"))
# remove rows with missing cases
df_repl<-df_repl[complete.cases(df_repl), ]
df_sel<-df_sel[complete.cases(df_sel), ]

# print number of obs
print(paste('Number of observations in the data:',nrow(df_repl),' (full sample);',nrow(df_sel), ' (selected/causal sample)'))
## [1] "Number of observations in the data: 38455  (full sample); 10328  (selected/causal sample)"

1.5.2 Manipulate data

# Treatment indicators
df_repl<-df_repl%>%mutate(D=case_when(lapshould==1~"LapShoulderSeat",lapbelt==1~"Lapbelt",
                                      childseat==1~"Childseat",TRUE~"NONE"),
                          D=factor(D,levels=c("NONE","Lapbelt","LapShoulderSeat","Childseat")),
                          Dbinary=case_when(lapshould==1~1,lapbelt==1~1,childseat==1~1,TRUE~0),
                         car_age=year-modelyr)
df_sel <-df_sel %>%mutate(D=case_when(lapshould==1~"LapShoulderSeat",lapbelt==1~"Lapbelt",
                                    childseat==1~"Childseat",TRUE~"NONE"),
                         D=factor(D,levels=c("NONE","Lapbelt","LapShoulderSeat","Childseat")),
                         Dbinary=case_when(lapshould==1~1,lapbelt==1~1,childseat==1~1,TRUE~0),
                         car_age=year-modelyr)
# Convert categorical to indicators
df_repl<-dummy_cols(df_repl%>%select(-restraint))%>%select(-starts_with("D_"),-crashtm,-crashcar,-age,-vehicles1,-vehicles2)
df_sel<-dummy_cols(df_sel%>%select(-restraint))%>%select(-starts_with("D_"),-crashtm,-crashcar,-age,-vehicles1,-vehicles2)
#df_repl<-df_repl%>%mutate(day=ifelse(crashtm=="1_day",1,0),night=ifelse(crashtm=="2_night",1,0),morn=ifelse(crashtm=="3_morn",1,0))
#df_sel<- df_sel %>%mutate(day=ifelse(crashtm=="1_day",1,0),night=ifelse(crashtm=="2_night",1,0),morn=ifelse(crashtm=="3_morn",1,0))
# Select variables
#df_repl<-df_repl%>%select(splmU55,thoulbs_I,numcrash,weekend,lowviol,highviol,ruralrd,frimp,suv,death,D,Dbinary,modelyr,age,year,car_age)
#df_sel<- df_sel %>%select(splmU55,thoulbs_I,numcrash,weekend,lowviol,highviol,ruralrd,frimp,suv,death,D,Dbinary,modelyr,age,year,car_age)
# Training and test data
set.seed(seed)
df_repl_split <- initial_split(df_repl, prop = .5)
df_repl_train <- training(df_repl_split)
df_repl_test  <- testing(df_repl_split)
df_sel_split <- initial_split(df_sel, prop = .5)
df_sel_train <- training(df_sel_split)
df_sel_test  <- testing(df_sel_split)
# OVerride settings above
df_repl_train <- df_repl
df_repl_test  <- df_repl
df_sel_train <- df_sel
df_sel_test  <- df_sel

# X Matrices
X_repl_train<-as.matrix(df_repl_train%>%select(-death,-D,-Dbinary, -childseat,-lapbelt,-lapshould))
X_repl_test<- as.matrix(df_repl_test%>%select(-death,-D,-Dbinary, -childseat,-lapbelt,-lapshould))
X_sel_train<- as.matrix(df_sel_train%>%select(-death,-D,-Dbinary, -childseat,-lapbelt,-lapshould))
X_sel_test<-  as.matrix(df_sel_test%>%select(-death,-D,-Dbinary, -childseat,-lapbelt,-lapshould))
X_repl_train_nocontrols<-as.matrix(rep(1,nrow(X_repl_train)))
X_repl_test_nocontrols<- as.matrix(rep(1,nrow(X_repl_test)))
X_sel_train_nocontrols<- as.matrix(rep(1,nrow(X_sel_train)))
X_sel_test_nocontrols<-  as.matrix(rep(1,nrow(X_sel_test)))
# D matrices
D_repl_train<-factor(df_repl_train$D,levels=c("NONE","Lapbelt","LapShoulderSeat","Childseat"))
D_repl_test<-factor(df_repl_train$D,levels=c("NONE","Lapbelt","LapShoulderSeat","Childseat"))
D_sel_train<-factor(df_sel_train$D,levels=c("NONE","Lapbelt","LapShoulderSeat","Childseat"))
D_sel_test<-factor(df_sel_train$D,levels=c("NONE","Lapbelt","LapShoulderSeat","Childseat"))
D_binary_repl_train<-as.matrix(df_repl_train%>%select(Dbinary))
D_binary_repl_test<- as.matrix(df_repl_test%>%select(Dbinary))
D_binary_sel_train<- as.matrix(df_sel_train%>%select(Dbinary))
D_binary_sel_test<-  as.matrix(df_sel_test%>%select(Dbinary))
# Y matrices
Y_repl_train<-as.matrix(df_repl_train%>%select(death))
Y_repl_test<- as.matrix(df_repl_test%>%select(death))
Y_sel_train<- as.matrix(df_sel_train%>%select(death))
Y_sel_test<-  as.matrix(df_sel_test%>%select(death))

2 Causal Forest out of the box

2.1 Estimate and print AIPW ATE

#set.seed(seed)
#Y.forest_het = regression_forest(X=X_sel_train, Y_sel_train)
#Y.hat_het = predict(Y.forest_het )$predictions
#set.seed(seed)
#W.forest_het = regression_forest(X=X_sel_train, D_binary_sel_train)
#W.hat_het = predict(W.forest_het )$predictions

# Estimate forest
set.seed(seed)

cfbinary<- causal_forest(X=X_sel_train, Y=Y_sel_train, W=D_binary_sel_train,tune.parameters = "all")
average_treatment_effect(cfbinary)
##     estimate      std.err 
## -0.044536725  0.007602517

2.2 Diagnostic test

test_calibration(cfbinary)
## 
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
## 
##                                Estimate Std. Error t value    Pr(>t)    
## mean.forest.prediction          0.95373    0.12231  7.7979 3.450e-15 ***
## differential.forest.prediction  1.08311    0.28984  3.7370 9.363e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.3 Influential features

# Get importance
importance=variable_importance(cfbinary)

var_imp <- data.frame(importance=importance,names=colnames(X_sel_train))
ggplot(var_imp,aes(x= reorder(names,importance),y=importance))+
  geom_bar(stat="identity",fill="#f56c42",color="white")+
  theme_minimal()+
  theme(axis.text.x = element_text(angle=45,vjust = 1, hjust=1))+
  labs(x=" ")+
  coord_flip()

## CATE distribution

# get predictions
cate<-data.frame(sample="CATEs",tau=predict(cfbinary)$predictions)
# histogram all
ggplot(cate,aes(x=tau))+
   geom_histogram(aes(y=..count../sum(..count..)),bins=100,alpha=0.95, position = "identity",
                  fill="#f56c42",color="white")+
  theme_minimal()+
  labs(title=" ",x="Conditional Average Treatment Effect",y="Density")

2.4 Plot quartiles of CATE and corresponding AIPW

# Split sample in 5 groups based on cates
df_sel_train["categroup"] <- factor(ntile(predict(cfbinary)$predictions, n=4))
# calculate AIPW for each sub group
estimated_aipw_ate <- lapply(
  seq(4), function(w) {
  ate <- average_treatment_effect(cfbinary, subset = df_sel_train$categroup == w,method = "AIPW")
})
# Combine in data da frame
estimated_aipw_ate <- data.frame(do.call(rbind, estimated_aipw_ate))
estimated_aipw_ate$Ntile <- as.numeric(rownames(estimated_aipw_ate))
estimated_aipw_ate$type<-"AIPW"
# Mean of CATES
df_sel_train["cate"]<-predict(cfbinary)$predictions
cates<-df_sel_train%>%group_by(categroup)%>%summarise(estimate=mean(cate))%>%rename(Ntile=categroup)%>%mutate(std.err=NA,type="CATE")

dfplot<-rbind(estimated_aipw_ate,cates)
# create plot
ggplot(dfplot,aes(color=type)) +
  geom_pointrange(aes(x = Ntile, y = estimate, ymax = estimate + 1.96 * `std.err`, ymin = estimate - 1.96 * `std.err`), 
                  size = 1,
                  position = position_dodge(width = .5)) +
  theme_minimal() +
  geom_hline(yintercept=0,linetype="dashed")+
  labs(x = "Quartile", y = "AIPW ATE", title = "AIPW ATEs by  quartiles of the conditional average treatment effect")

2.5 CATES by by covariates

df_sel_train["tau"]<-predict(cfbinary)$predictions
df_sel_train_col<-df_sel_train%>%
  group_by(modelyr,splmU55)%>%
  summarise(tau=mean(tau))
p1<-ggplot(df_sel_train_col,aes(x=modelyr,y=tau,color=as.factor(splmU55)))+geom_point()+
  ylim(-0.125,0)
df_sel_train_col<-df_sel_train%>%
  group_by(year,splmU55)%>%
  summarise(tau=mean(tau))
p2<-ggplot(df_sel_train_col,aes(x=year,y=tau,color=as.factor(splmU55)))+geom_point()+
  ylim(-0.125,0)+labs(y="")+  theme(axis.title.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())
df_sel_train_col<-df_sel_train%>%
  group_by(thoulbs_I)%>%
  summarise(tau=mean(tau))
p3<-ggplot(df_sel_train_col,aes(x=thoulbs_I*1000,y=tau))+geom_point()+
  ylim(-0.125,0)+labs(y="")+  theme(axis.title.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())
ggarrange(p1, p2, p3, ncol=3, nrow=1, common.legend = TRUE, legend="bottom")

2.6 CATE distribution by speed limit

# get predictions
cate<-data.frame(sample="CATEs",splmU55=df_sel_train$splmU55,tau=predict(cfbinary)$predictions)
# histogram all
ggplot(cate,aes(x=tau,fill=as.factor(splmU55),group=splmU55))+
   geom_histogram(aes(y=..count../sum(..count..)),bins=100,alpha=0.5, position = "identity",
                 color="white")+
  theme_minimal()+
  labs(title=" ",x="Conditional Average Treatment Effect",y="Density")

3 A grid (or manual tuning)

3.1 Default parameters

cfbinary$tuning.output               
## Tuning status: default.
## This indicates tuning was attempted. However, we could not find parameters that were expected to perform better than default: 
## 
## sample.fraction: 0.5
##  mtry: 26
##  min.node.size: 5
##  honesty.fraction: 0.5
##  honesty.prune.leaves: TRUE
##  alpha: 0.05
##  imbalance.penalty: 0

3.2 Our grid

min.node.size<-c(5,50,100,500)
mtry<-c(26,5)
num.trees<-c(2000,200,4000)
honesty.fraction<-c(0.5,0.25,0.75)
alpha<-c(0.05,0.01,0.1)
sample<-c(.99,0.5)

mygrid<-expand.grid(sample,min.node.size,mtry,num.trees,honesty.fraction,alpha)
colnames(mygrid) <-c("sample","min.node.size","mtry","num.trees","honesty.fraction","alpha")

3.3 Estimate on grid and print results

# Add empty columns to grid
mygrid["ATE"]<-NA
mygrid["mean.forest.prediction"]<-NA
mygrid["differential.forest.prediction"]<-NA
catedist<-vector("list",nrow(mygrid))
# loop over grid
for (i in 1:nrow(mygrid)){
  
# load small dataset
df_sel<-read_delim("../data/FARS-data-selection-sample.txt",delim = "\t")%>%
              filter(year<2004)%>%
              select(-starts_with("imp"))
# remove rows with missing cases
df_sel<-df_sel[complete.cases(df_sel), ]
# Treatment indicators
df_sel <-df_sel %>%mutate(D=case_when(lapshould==1~"LapShoulderSeat",lapbelt==1~"Lapbelt",
                                    childseat==1~"Childseat",TRUE~"NONE"),
                         D=factor(D,levels=c("NONE","Lapbelt","LapShoulderSeat","Childseat")),
                         Dbinary=case_when(lapshould==1~1,lapbelt==1~1,childseat==1~1,TRUE~0),
                         car_age=year-modelyr)
# Convert categorical to indicators
df_sel<-dummy_cols(df_sel%>%select(-restraint))%>%select(-starts_with("D_"),-crashtm,-crashcar,-age,-vehicles1,-vehicles2)
# Training and test data
set.seed(seed)
df_sel_split <- initial_split(df_sel, prop =as.numeric(mygrid[i,]["sample"]))
df_sel_train <- training(df_sel_split)
# X Matrices
X_sel_train<- as.matrix(df_sel_train%>%select(-death,-D,-Dbinary, -childseat,-lapbelt,-lapshould))
# D matrices
D_binary_sel_train<- as.matrix(df_sel_train%>%select(Dbinary))
# Y matrices
Y_sel_train<- as.matrix(df_sel_train%>%select(death))

# Estimate causal foreswt
cf_facility_grid <- causal_forest(X_sel_train, Y=Y_sel_train, W=D_binary_sel_train,
      num.trees = as.numeric(mygrid[i,]["num.trees"]),
      min.node.size = as.numeric(mygrid[i,]["min.node.size"]),
      honesty.fraction = as.numeric(mygrid[i,]["honesty.fraction"]),
      mtry = as.numeric(mygrid[i,]["mtry"]),
      alpha = as.numeric(mygrid[i,]["alpha"])
      )
  # store treatment effect 
  te<-average_treatment_effect(cf_facility_grid)
  mygrid[i,"ATE"]<-te[1]
  #  store calibration test
  tc <- test_calibration(cf_facility_grid)
  mygrid[i,"mean.forest.prediction"]<-tc[1,1]
  mygrid[i,"differential.forest.prediction"]<-tc[2,1]
  # store chart of CATES
     # get predictions
     cate<-data.frame(sample="CATEs",splmU55=df_sel_train$splmU55,tau=predict(cf_facility_grid)$predictions)%>%
       mutate(SPLMU55=ifelse(splmU55==1," 1", "0"))
     # histogram all
     p1<-ggplot(cate,aes(x=tau,color=as.factor(SPLMU55),fill=as.factor(SPLMU55),group=SPLMU55))+
       geom_histogram(aes(y=..count../sum(..count..)),bins=100,alpha=0.75, position = "identity",
                     size=0.1,color="white")+theme(text=element_text(size=6))+
      theme_minimal()+theme(legend.position = "top")+
      labs(fill="splmU55",x="cate",y=" ",title=paste("Spec: ",i))
    catedist[[i]]<-p1

}

3.4 Plot results on chart

main<-ggplot(mygrid)+geom_point(aes(x=mean.forest.prediction,y=differential.forest.prediction,color=abs(ATE)))+
      theme_minimal()+labs(title="BLP omnibus test across specifications" ) 
inset<-ggplot(mygrid)+geom_point(aes(x=mean.forest.prediction,y=differential.forest.prediction,color=abs(ATE)))+
  theme_minimal()+theme(panel.background =  element_rect(fill = "white"))+
        xlim(0.5,2)+ylim(-0.5,2)+labs(title="Zoomed")

ggpubr::ggarrange(plotlist=list(main,inset),common.legend = TRUE)

3.5 Plot charts by grid

ggpubr::ggarrange(plotlist=catedist,common.legend = TRUE,ncol=4)
## $`1`

## 
## $`2`

## 
## $`3`

## 
## $`4`

## 
## $`5`

## 
## $`6`

## 
## $`7`

## 
## $`8`

## 
## $`9`

## 
## $`10`

## 
## $`11`

## 
## $`12`

## 
## $`13`

## 
## $`14`

## 
## $`15`

## 
## $`16`

## 
## $`17`

## 
## $`18`

## 
## $`19`

## 
## $`20`

## 
## $`21`

## 
## $`22`

## 
## $`23`

## 
## $`24`

## 
## $`25`

## 
## $`26`

## 
## $`27`

## 
## $`28`

## 
## $`29`

## 
## $`30`

## 
## $`31`

## 
## $`32`

## 
## $`33`

## 
## $`34`

## 
## $`35`

## 
## $`36`

## 
## $`37`

## 
## $`38`

## 
## $`39`

## 
## $`40`

## 
## $`41`

## 
## $`42`

## 
## $`43`

## 
## $`44`

## 
## $`45`

## 
## $`46`

## 
## $`47`

## 
## $`48`

## 
## $`49`

## 
## $`50`

## 
## $`51`

## 
## $`52`

## 
## $`53`

## 
## $`54`

## 
## $`55`

## 
## $`56`

## 
## $`57`

## 
## $`58`

## 
## $`59`

## 
## $`60`

## 
## $`61`

## 
## $`62`

## 
## $`63`

## 
## $`64`

## 
## $`65`

## 
## $`66`

## 
## $`67`

## 
## $`68`

## 
## $`69`

## 
## $`70`

## 
## $`71`

## 
## $`72`

## 
## $`73`

## 
## $`74`

## 
## $`75`

## 
## $`76`

## 
## $`77`

## 
## $`78`

## 
## $`79`

## 
## $`80`

## 
## $`81`

## 
## $`82`

## 
## $`83`

## 
## $`84`

## 
## $`85`

## 
## $`86`

## 
## $`87`

## 
## $`88`

## 
## $`89`

## 
## $`90`

## 
## $`91`

## 
## $`92`

## 
## $`93`

## 
## $`94`

## 
## $`95`

## 
## $`96`

## 
## $`97`

## 
## $`98`

## 
## $`99`

## 
## $`100`

## 
## $`101`

## 
## $`102`

## 
## $`103`

## 
## $`104`

## 
## $`105`

## 
## $`106`

## 
## $`107`

## 
## $`108`

## 
## attr(,"class")
## [1] "list"      "ggarrange"